Problem Statement: Assess air quality from the 'Air Quality Chemical Multisensor Device' dataset by fitting different regression models, comparing their R^2 scores (coefficient of determination), and selecting the best-scoring model to evaluate the air quality.
# IMPORT LIBRARIES
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import datetime
# LOAD THE AIR-QUALITY WORKBOOK INTO A DATAFRAME
AQ = pd.read_excel("AirQuality.xlsx")
# PREVIEW THE FIRST ROWS
AQ.head()
# NUMBER OF ROWS AND COLUMNS
AQ.shape
# COLUMN SUMMARY
AQ.info()
# WHICH COLUMNS CONTAIN NULL VALUES
AQ.isna().any()
# USE THE DATE COLUMN AS A DATETIME INDEX
AQ = AQ.set_index("Date")
AQ.index = pd.to_datetime(AQ.index)
type(AQ.index)
# KEEP ONLY THE HOUR (HH) OF THE HH:MM:SS TIME COLUMN
AQ['Time'] = pd.to_datetime(AQ['Time'], format='%H:%M:%S').dt.hour
AQ.head(n=3)
#DATA DESCRIPTION
AQ.describe()
#COUNT OF ROWS WHERE CO(GT) EQUALS -200
AQ[AQ['CO(GT)']==-200].count()
#REPLACING EVERY -200 WITH NaN SO IT IS HANDLED AS A MISSING VALUE.
#np.nan IS USED BECAUSE THE np.NaN ALIAS WAS REMOVED IN NUMPY 2.0.
AQ.replace(to_replace= -200, value= np.nan, inplace= True)
#CHECKING NULL VALUES (PER-COLUMN FLAG, THEN PER-COLUMN COUNT).
AQ.isnull().any()
AQ.isnull().sum()
#YELLOW COLOR IN A PLOT SHOWS THE NULL VALUE
plt.figure(figsize=(8,6))
sns.heatmap(AQ.isnull(),yticklabels=False,cbar=False,cmap='viridis')
plt.show()
AQ.shape
#REPLACING NULL VALUES WITH THE COLUMN MEDIAN
AQ.fillna(AQ.median(), inplace=True)
#SAME NULL HEATMAP AGAIN, DRAWN AFTER THE FILL
sns.heatmap(AQ.isnull(),yticklabels=False,cbar=False,cmap='viridis')
plt.show()
# PAIRWISE CORRELATION OF THE COLUMNS
corr = AQ.corr()
# SHOW THE CORRELATION TABLE
corr
# ANNOTATED HEATMAP OF THE CORRELATION MATRIX
#   vmin / vmax : fix the colour scale to the range [-1, 1]
#   cmap        : colour map used for the cells
#   annot       : write each correlation value inside its cell
#   annot_kws   : font size of those annotations
plt.figure(figsize=(10, 5))
sns.heatmap(
    corr,
    vmin=-1.0,
    vmax=1.0,
    cmap='YlGnBu',
    annot=True,
    annot_kws={"size": 12},
)
plt.title('Correlation Matrix')
plt.show()
# BOXPLOTS OF THE NUMERIC FEATURES, SPLIT OVER TWO FIGURES
# FIGURE 1: THE PT08.* COLUMNS PLUS NOx(GT) AND NMHC(GT)
plt.figure(figsize=(15, 5))
AQ.boxplot(
    column=[
        'PT08.S1(CO)',
        'PT08.S2(NMHC)',
        'NOx(GT)',
        'PT08.S3(NOx)',
        'PT08.S4(NO2)',
        'PT08.S5(O3)',
        'NMHC(GT)',
    ]
)
plt.show()
# FIGURE 2: CO(GT), C6H6(GT), T, RH AND AH
plt.figure(figsize=(15, 5))
AQ.boxplot(column=['CO(GT)', 'C6H6(GT)', 'T', 'RH', 'AH'])
plt.show()
# PAIRWISE SCATTER MATRIX OF EVERY COLUMN
sns.pairplot(AQ)
# PAIRWISE SCATTER MATRIX WITH Time, RH, AH AND T DROPPED
sns.set_style('whitegrid')
eda_AQ = AQ.drop(columns=['Time', 'RH', 'AH', 'T'])
sns.pairplot(eda_AQ)
# HISTOGRAM OF EVERY FEATURE
AQ.hist(figsize=(20, 20))
plt.show()
CONCLUSION FROM HISTOGRAM: From the histograms, we can observe the variability of each attribute. We can also observe the skewness of the data.
The distplot shows the distribution of a univariate set of observations.
# DISTRIBUTION PLOTS OF INDIVIDUAL COLUMNS, FOLLOWED BY A MONTHLY NOx BAR CHART.
# NOTE(review): EACH plt.title() IS ISSUED BEFORE ITS distplot() AND NO NEW FIGURE IS OPENED
# BETWEEN PLOTS. IF THESE LINES RUN AS ONE SCRIPT RATHER THAN AS SEPARATE NOTEBOOK CELLS,
# THE PLOTS SHARE THE CURRENT AXES -- CONFIRM THE INTENDED CELL BOUNDARIES.
# NOTE(review): sns.distplot IS DEPRECATED IN RECENT SEABORN RELEASES -- VERIFY AGAINST THE
# INSTALLED VERSION BEFORE UPGRADING.
# DISTRIBUTION OF CO(GT)
plt.title("True hourly averaged concentration CO in mg/m^3 Distribution")
sns.distplot(AQ['CO(GT)'])
# DISTRIBUTION OF PT08.S1(CO)
plt.title("PT08.S1 (tin oxide) hourly averaged sensor response (nominally CO targeted) Distribution")
sns.distplot(AQ['PT08.S1(CO)'])
# DISTRIBUTION OF PT08.S2(NMHC)
# NOTE(review): THE TITLE DESCRIBES A TRUE CONCENTRATION WHILE THE PLOTTED COLUMN IS
# PT08.S2(NMHC) -- CONFIRM WHICH COLUMN / WORDING WAS INTENDED.
plt.title("True hourly averaged overall Non Metanic HydroCarbons concentration in microg/m^3 Distribution")
sns.distplot(AQ['PT08.S2(NMHC)'])
# DISTRIBUTION OF C6H6(GT)
plt.title("True hourly averaged Benzene concentration in microg/m^3 Distribution")
sns.distplot(AQ['C6H6(GT)'])
# DISTRIBUTION OF T (NO TITLE IS SET FOR THIS ONE)
sns.distplot(AQ['T'])
# BAR CHART OF THE MEAN NOx(GT) PER MONTH, RESAMPLED ON THE DATETIME INDEX
AQ['NOx(GT)'].resample('M').mean().plot(kind='bar', figsize=(18,6))
plt.xlabel('Month')
plt.ylabel('Total Nitrogen Oxides(NOx)')
plt.title("Mean Total Nitrogen Oxides (NOx) Level by Month")
CONCLUSION FROM BARPLOT - We can see that the mean Nitrogen Oxides (NOx) level is low in the initial months and increases as the year passes.
# BAR PLOT OF NOx(GT) FOR EACH HOUR OF THE DAY, WITHOUT ERROR BARS
plt.figure(figsize=(20, 6))
sns.barplot(data=AQ, x='Time', y='NOx(GT)', ci=False)
plt.title("Mean Total Nitrogen Oxides (NOx) Frequency During Days")
plt.xlabel('Hours')
plt.ylabel('Total Nitrogen Oxides(NOx)')
# SCATTER PLOT OF NOx(GT) (Y AXIS) AGAINST NO2(GT) (X AXIS)
AQ.plot(x='NO2(GT)',y='NOx(GT)', kind='scatter', figsize = (10,6), alpha=0.3)
plt.xlabel('Level of Nitrogen Dioxide')
plt.ylabel('Level of Nitrogen Oxides(NOx)')
# THE TITLE NAMES THE TWO PLOTTED COLUMNS
plt.title("Nitrogen Oxides (NOx) vs Nitrogen Dioxide (NO2)")
plt.tight_layout();
# RELATIVE HUMIDITY AGAINST TEMPERATURE, X AXIS LIMITED TO 0..40.
# LABELS, LIMITS AND TITLE ARE SET ON THE CURRENT AXES BEFORE THE POINTS ARE DRAWN.
plt.xlabel("Temperature")
plt.ylabel('Relative Humidity')
plt.xlim(0,40)
plt.title("Relative Humidity vs Temperature-Full")
plt.scatter(AQ['T'],AQ["RH"],marker=".")
# SAME SCATTER WITH THE X AXIS LIMITED TO -5..10.
# NOTE(review): NO NEW FIGURE IS OPENED BETWEEN THE TWO PLOTS -- IF BOTH RUN IN ONE CELL
# THEY DRAW ON THE SAME AXES; CONFIRM THE INTENDED CELL BOUNDARIES.
plt.xlabel("Temperature in Degree Celsius")
plt.ylabel('Relative Humidity')
plt.xlim(-5,10) #Get or set the x limits of the current axes.
plt.title("Relative Humidity vs Temperature")
plt.scatter(AQ['T'],AQ["RH"],marker=".")
# TARGET: C6H6(GT). IT IS NUMERICAL, SO REGRESSION ALGORITHMS ARE USED.
# FEATURES: EVERY OTHER COLUMN.
y = AQ['C6H6(GT)']
x = AQ.drop(columns='C6H6(GT)')
x
y
# HOLD OUT 20% OF THE ROWS AS THE TEST SET, WITH A FIXED RANDOM SEED
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=62
)
Standardizing a dataset involves rescaling the distribution of values so that the mean of observed values is 0 and the standard deviation is 1.
# FIT THE SCALER ON THE TRAINING FEATURES ONLY, THEN APPLY IT TO BOTH SPLITS
from sklearn.preprocessing import StandardScaler
std = StandardScaler().fit(x_train)
x_train, x_test = std.transform(x_train), std.transform(x_test)
K nearest neighbors is a simple algorithm that stores all available cases and predicts the numerical target based on a similarity measure (e.g., a distance function).
# FIT A 5-NEIGHBOUR KNN REGRESSOR ON THE TRAINING SPLIT
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=5).fit(x_train, y_train)
# PREDICT THE TARGET FOR THE TEST SPLIT
y_pred = knn.predict(x_test)
y_pred
# ACTUAL AND PREDICTED VALUES SIDE BY SIDE
df1 = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df1
# SCATTER OF ACTUAL (X AXIS) AGAINST PREDICTED (Y AXIS)
plt.figure(figsize=(12, 6))
plt.scatter(y_test, y_pred, marker='+')
R-squared is a statistical measure that represents the goodness of fit of a regression model. The ideal value of R-squared is 1: the closer R-squared is to 1, the better the model fits the data.
# R^2 OF THE KNN PREDICTIONS ON THE TEST SPLIT, EXPRESSED AS A PERCENTAGE
from sklearn.metrics import r2_score
a = 100 * r2_score(y_test, y_pred)
a
Linear regression finds the linear relationship between the dependent variable and one or more independent variables using a best-fit straight line.
# FIT A LINEAR REGRESSION MODEL ON THE TRAINING SPLIT
from sklearn.linear_model import LinearRegression
LR = LinearRegression().fit(x_train, y_train)
# PREDICT THE TARGET FOR THE TEST SPLIT
y_pred = LR.predict(x_test)
y_pred
# ACTUAL AND PREDICTED VALUES SIDE BY SIDE
df2 = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df2
# ACTUAL VS PREDICTED, WITH A DASHED BLACK y = x REFERENCE LINE FROM 0 TO 50
plt.figure(figsize=(6, 3))
plt.scatter(y_test, y_pred, marker='+')
plt.plot([0, 50], [0, 50], '--k')
# 'tight' -- limits just large enough to show all data, then no further autoscaling
plt.axis('tight')
# R^2 ON THE TEST SPLIT, EXPRESSED AS A PERCENTAGE
from sklearn.metrics import r2_score
b = 100 * r2_score(y_test, y_pred)
b
Decision tree regression observes features of an object and trains a model in the structure of a tree to predict data in the future to produce meaningful continuous output.
# FIT A DECISION TREE REGRESSOR AND COMPARE ITS TEST R^2 WITH THE OTHER MODELS
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
# FIT ON THE STANDARDIZED TRAINING SPLIT ONLY. FITTING ON THE FULL x, y WOULD LET THE TREE
# SEE THE TEST ROWS (INFLATING THE TEST SCORE) AND WOULD TRAIN ON UNSCALED FEATURES WHILE
# PREDICTING ON THE SCALED x_test / x_train.
regressor.fit(x_train, y_train)
# PREDICTIONS FOR THE TEST SPLIT
y_pred=regressor.predict(x_test)
y_pred
# PREDICTIONS FOR THE TRAINING SPLIT (DISPLAYED ONLY, NOT USED IN THE SCORE BELOW)
y1_pred=regressor.predict(x_train)
y1_pred
# ACTUAL AND PREDICTED TEST VALUES SIDE BY SIDE
df3 = pd.DataFrame({'Actual':y_test,'Predicted':y_pred})
df3
#VISUALIZATION OF RESULT
plt.figure(figsize=(12,6))
plt.scatter(y_test,y_pred,marker='+')
# R^2 ON THE TEST SPLIT, EXPRESSED AS A PERCENTAGE
from sklearn.metrics import r2_score
c=r2_score(y_test,y_pred)*100
c
# SUMMARY OF THE THREE TEST-SET SCORES
print("r2_score of K-NN Regression is : ",a)
print("r2_score of linear Regression is : ",b)
print("r2_score of Decision Tree Regression is : ",c)
The r2_score is highest for the Decision Tree, so we can conclude that the Decision Tree Regression model is the best of the three models for evaluating the quality of air on the given Air Quality Prediction dataset.